#include <iostream>
#include <unistd.h>

#include "opencv2/opencv_modules.hpp"


#include "opencv2/core.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/cudafeatures2d.hpp"
#include "opencv2/xfeatures2d/cuda.hpp"

using namespace std;
using namespace cv;

// Print the command-line usage banner for this benchmark tool.
static void help()
{
    const std::string usage = "\nUsage:\n\t./main <image> ";
    std::cout << usage << std::endl;
}

int main(int argc, char* argv[])
{
    if (argc != 2)
    {
        help();
        return -1;
    }

    cv::cuda::GpuMat img1;

    img1.upload(imread(argv[1], IMREAD_GRAYSCALE));
    cv::Mat img1_cpu = imread(argv[1], IMREAD_GRAYSCALE);
    // for (int i = 1; i < argc; ++i)
    // {
    //     if (string(argv[i]) == "--left")
    //     {
    //         img1.upload(imread(argv[++i], IMREAD_GRAYSCALE));
    //         CV_Assert(!img1.empty());
    //     }
    //     else if (string(argv[i]) == "--right")
    //     {
    //         img2.upload(imread(argv[++i], IMREAD_GRAYSCALE));
    //         CV_Assert(!img2.empty());
    //     }
    //     else if (string(argv[i]) == "--help")
    //     {
    //         help();
    //         return -1;
    //     }
    // }


    cv::cuda::printShortCudaDeviceInfo(cv::cuda::getDevice());

    /* https://docs.opencv.org/4.5.5/db/d06/classcv_1_1cuda_1_1SURF__CUDA.html
    cv::cuda::SURF_CUDA::SURF_CUDA(
        double _hessianThreshold,                           // 默认100,图像Hessian矩阵判别式的阈值，越高监测的点越少
        int     _nOctaves = 4,                              // 默认4，金字塔组数            
        int     _nOctaveLayers = 2,                         // 默认3，每组金子塔的层数
        bool    _extended = false,                          // 默认False，扩展描述符标志，True表示使用扩展的128个元素描述符，False表示使用64个元素描述符
        float   _keypointsRatio = 0.01f,                    // ---
        bool    _upright = false                            // 默认False，垂直向上或旋转的特征标志，True表示不计算特征的方向，False-计算方向
    ) 
    */
    // cv::cuda::SURF_CUDA surf(250);

    /* https://docs.opencv.org/4.5.5/da/d44/classcv_1_1cuda_1_1ORB.html
    static Ptr<cuda::ORB> cv::cuda::ORB::create(
        int     nfeatures = 500,                            // 要检测的最大关键点数量
        float   scaleFactor = 1.2f,                         // 金字塔缩放比例
        int     nlevels = 8,                                // 金字塔层数
        int     edgeThreshold = 31,                         // 这是未检测到特征的边界的大小，它应该与patchSize参数大致匹配
        int     firstLevel = 0,                             // 原图像所在的金字塔层数
        int     WTA_K = 2,                                  // 生成定向BRIEF描述符的每个元素的点数
        int     scoreType = cv::ORB::HARRIS_SCORE,          // 默认的HARRIS_SCORE表示哈里斯算法用于对特征进行排序
        int     patchSize = 31,                             // 计算BRIEF描述符使用的像素区域大小
        int     fastThreshold = 20,                         // fast特征点的阈值
        bool    blurForDescriptor = false                   // 计算描述子时是否对图像模糊处理
    )
    */
    Ptr<cv::cuda::ORB> gpuORBDetector = cv::cuda::ORB::create(5000, 1.2f, 8, 31, 0, 2, 0, 31, 20,true);

    /* https://docs.opencv.org/4.5.5/d4/d6a/classcv_1_1cuda_1_1FastFeatureDetector.html
    static Ptr<cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(
        int     threshold = 10,                             // fast特征点的阈值
        bool    nonmaxSuppression = true,                   // 是否使用非极大值抑制
        int     type = cv::FastFeatureDetector::TYPE_9_16,  // fast特征点的种类
        int     max_npoints = 5000                          // 要检测的最大关键点数量
        )
    */
    Ptr<cv::cuda::FastFeatureDetector> gpuFastDetector = cv::cuda::FastFeatureDetector::create(37, true, cv::FastFeatureDetector::TYPE_9_16, 500);
    


    Ptr<cv::FastFeatureDetector> cpu_fast = cv::FastFeatureDetector::create(37, true, cv::FastFeatureDetector::TYPE_9_16);
    vector<KeyPoint> fastkeyPoints_cpu;
    cpu_fast -> detect(img1_cpu, fastkeyPoints_cpu);


    Ptr<ORB> cpu_orb = ORB::create(5000, 1.2f, 8, 31, 0, 2, ORB::HARRIS_SCORE, 31, 20);
    Mat orbcpu_descriptors;
    vector<KeyPoint> orbkeyPoints_cpu;
    cpu_orb -> detectAndCompute(img1_cpu, Mat(), orbkeyPoints_cpu, orbcpu_descriptors);



    cv::cuda::GpuMat keypoints1GPU, keypoints2GPU;
    cv::cuda::GpuMat descriptors1GPU, descriptors2GPU;
    vector<KeyPoint> gpuSURFKeypoints, gpuFASTKeypoints, gpuORBKeypoints;
    vector<float> SURFdescriptors, ORBdescriptors;



    // detecting keypoints & computing descriptors
    // operator()方法，https://docs.opencv.org/4.5.5/db/d06/classcv_1_1cuda_1_1SURF__CUDA.html
    // surf(img1, cv::cuda::GpuMat(), keypoints1GPU, descriptors1GPU);
    // std::cout << "keypoints1GPU :  \n\r  rows " << keypoints1GPU.rows << "\n\r  cols " << keypoints1GPU.cols << "\n\r  type " << keypoints1GPU.type() << std::endl;
    // std::cout << "descriptors1GPU :  \n\r  rows " << descriptors1GPU.rows << "\n\r  cols " << descriptors1GPU.cols << "\n\r  type " << descriptors1GPU.type() << std::endl;
    
    // https://docs.opencv.org/4.5.5/d0/d13/classcv_1_1Feature2D.html#ab3cce8d56f4fc5e1d530b5931e1e8dc0
    gpuFastDetector->detect(img1, gpuFASTKeypoints);

    // https://docs.opencv.org/4.5.5/df/db9/classcv_1_1cuda_1_1Feature2DAsync.html#a56c6c75e25e9616934c25552164a363c
    gpuORBDetector -> detectAndComputeAsync(img1, cv::cuda::GpuMat(), keypoints2GPU, descriptors2GPU);
    std::cout << "keypoints2GPU :  \n\r  rows " << keypoints2GPU.rows << "\n\r  cols " << keypoints2GPU.cols << "\n\r  type " << keypoints2GPU.type() << std::endl;
    std::cout << "descriptors2GPU :  \n\r  rows " << descriptors2GPU.rows << "\n\r  cols " << descriptors2GPU.cols << "\n\r  type " << descriptors2GPU.type() << std::endl;
    

    sleep(5);
    while (1)
    {
        int64 t1 = cv::getTickCount();
        cpu_fast -> detect(img1_cpu, fastkeyPoints_cpu);
        // // detecting keypoints & computing descriptors
        // surf(img1, cv::cuda::GpuMat(), keypoints1GPU, descriptors1GPU);
        // // downloading results
        // surf.downloadKeypoints(keypoints1GPU, gpuSURFKeypoints);
        // surf.downloadDescriptors(descriptors1GPU, SURFdescriptors);
        int64 t2 = cv::getTickCount();



        int64 t3 = cv::getTickCount();
        gpuFastDetector->detect(img1, gpuFASTKeypoints);
        int64 t4 = cv::getTickCount();

        int64 t5 = cv::getTickCount();
        // detecting keypoints & computing descriptors
        gpuORBDetector->detectAndComputeAsync(img1, cv::cuda::GpuMat(), keypoints2GPU, descriptors2GPU);
        // downloading results
        gpuORBDetector->convert(keypoints2GPU, gpuORBKeypoints);
        

        int64 t6 = cv::getTickCount();


        cpu_orb -> detectAndCompute(img1_cpu, Mat(), orbkeyPoints_cpu, orbcpu_descriptors);
        int64 t7 = cv::getTickCount();


        // std::cout << "gpuSURFKeypoints.size():  " << gpuSURFKeypoints.size() << std::endl;
        std::cout << "gpuFASTKeypoints.size():  " << gpuFASTKeypoints.size() << std::endl;
        std::cout << "gpuORBKeypoints.size() :  " << gpuORBKeypoints.size() << std::endl;
        std::cout << "orbkeyPoints_cpu.size() :  " << orbkeyPoints_cpu.size() << std::endl;
        std::cout << "fastkeyPoints_cpu.size():  " << fastkeyPoints_cpu.size() << std::endl;

        // float t_surf = 1000 * (t2 - t1) / cv::getTickFrequency();
        float t_Fast = 1000 * (t4 - t3) / cv::getTickFrequency();
        float t_ORB  = 1000 * (t6 - t5) / cv::getTickFrequency();
        float cpu_ORB  = 1000 * (t7 - t6) / cv::getTickFrequency();
        float cpu_Fast = 1000 * (t2 - t1) / cv::getTickFrequency();
        std::cout << "t_Fast  " << t_Fast << std::endl ;
        // std::cout << "t_surf  " << t_surf << std::endl ;
        std::cout << "t_ORB   " << t_ORB << std::endl ;
        std::cout << "cpu_ORB   " << cpu_ORB << std::endl ;
        std::cout << "cpu_Fast  " << cpu_Fast << std::endl ;

        // waitKey(0);

    }
    return 0;
}

